In [1]:
import wandb
import pandas as pd
import ydata_profiling
/Users/thierrygrimm/TorchStudio/python/envs/mlflow-bfab3c62e4539be1cc3154fca55686ec122e435e/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm /Users/thierrygrimm/TorchStudio/python/envs/mlflow-bfab3c62e4539be1cc3154fca55686ec122e435e/lib/python3.11/site-packages/numba/core/decorators.py:262: NumbaDeprecationWarning: numba.generated_jit is deprecated. Please see the documentation at: https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-generated-jit for more information and advice on a suitable replacement. warnings.warn(msg, NumbaDeprecationWarning) /Users/thierrygrimm/TorchStudio/python/envs/mlflow-bfab3c62e4539be1cc3154fca55686ec122e435e/lib/python3.11/site-packages/visions/backends/shared/nan_handling.py:50: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details. @nb.jit
In [2]:
# Open a W&B run for this exploratory analysis, filed under the "eda" group
# of the "nyc_airbnb" project. save_code=True asks W&B to store the code
# alongside the run.
run = wandb.init(
    project="nyc_airbnb",
    group="eda",
    save_code=True,
)
wandb: Currently logged in as: thierrygrimm. Use `wandb login --relogin` to force relogin
Tracking run with wandb version 0.15.8
Run data is saved locally in
/Users/thierrygrimm/Library/CloudStorage/OneDrive-Persönlich/Machine Learning/Udacity projects/ml-pipeline-rental-prices/src/eda/wandb/run-20230802_120248-trr8n7pp
View project at https://wandb.ai/thierrygrimm/nyc_airbnb
In [3]:
# Declare the latest version of the sample.csv artifact as an input of the
# current run, resolve it to a local file path, and load it with pandas.
sample_artifact = wandb.use_artifact("sample.csv:latest")
local_path = sample_artifact.file()
df = pd.read_csv(local_path)
In [4]:
# Preview the first rows of the loaded sample to check columns and value formats.
df.head()
Out[4]:
| | id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9138664 | Private Lg Room 15 min to Manhattan | 47594947 | Iris | Queens | Sunnyside | 40.74271 | -73.92493 | Private room | 74 | 2 | 6 | 2019-05-26 | 0.13 | 1 | 5 |
| 1 | 31444015 | TIME SQUARE CHARMING ONE BED IN HELL'S KITCHEN... | 8523790 | Johlex | Manhattan | Hell's Kitchen | 40.76682 | -73.98878 | Entire home/apt | 170 | 3 | 0 | NaN | NaN | 1 | 188 |
| 2 | 8741020 | Voted #1 Location Quintessential 1BR W Village... | 45854238 | John | Manhattan | West Village | 40.73631 | -74.00611 | Entire home/apt | 245 | 3 | 51 | 2018-09-19 | 1.12 | 1 | 0 |
| 3 | 34602077 | Spacious 1 bedroom apartment 15min from Manhattan | 261055465 | Regan | Queens | Astoria | 40.76424 | -73.92351 | Entire home/apt | 125 | 3 | 1 | 2019-05-24 | 0.65 | 1 | 13 |
| 4 | 23203149 | Big beautiful bedroom in huge Bushwick apartment | 143460 | Megan | Brooklyn | Bushwick | 40.69839 | -73.92044 | Private room | 65 | 2 | 8 | 2019-06-23 | 0.52 | 2 | 8 |
In [5]:
# Build a profiling report object for the DataFrame; it is rendered in a later cell.
profile = ydata_profiling.ProfileReport(df)
In [8]:
# Render the profiling report inline in the notebook as an iframe.
profile.to_notebook_iframe()
Summarize dataset: 100%|█████| 126/126 [00:06<00:00, 19.44it/s, Completed] Generate report structure: 100%|████████████| 1/1 [00:03<00:00, 3.21s/it] Render HTML: 100%|██████████████████████████| 1/1 [00:01<00:00, 1.18s/it]
In [9]:
# Treat prices outside [min_price, max_price] as outliers and discard them.
# Series.between is inclusive on both ends by default, so listings priced
# exactly at either bound are kept.
min_price, max_price = 10, 350
in_price_range = df['price'].between(min_price, max_price)
df = df.loc[in_price_range].copy()

# Parse last_review into datetime values; rows without a review stay missing.
df = df.assign(last_review=pd.to_datetime(df['last_review']))
In [10]:
# Check dtypes and non-null counts after the price filter and datetime conversion.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 19001 entries, 0 to 19999 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 19001 non-null int64 1 name 18994 non-null object 2 host_id 19001 non-null int64 3 host_name 18993 non-null object 4 neighbourhood_group 19001 non-null object 5 neighbourhood 19001 non-null object 6 latitude 19001 non-null float64 7 longitude 19001 non-null float64 8 room_type 19001 non-null object 9 price 19001 non-null int64 10 minimum_nights 19001 non-null int64 11 number_of_reviews 19001 non-null int64 12 last_review 15243 non-null datetime64[ns] 13 reviews_per_month 15243 non-null float64 14 calculated_host_listings_count 19001 non-null int64 15 availability_365 19001 non-null int64 dtypes: datetime64[ns](1), float64(3), int64(7), object(5) memory usage: 2.5+ MB
In [ ]:
# Close the W&B run that was opened with wandb.init at the top of the notebook.
run.finish()